{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "bd0215ae",
   "metadata": {},
   "source": [
    "#### Search for Text in Word Documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "4636c31a",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['C:/Local/test.docx']"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def search_word_files(directory, keyword):\n",
    "    import os\n",
    "    files_found = []\n",
    "    from docx import Document\n",
    "    for root, dirs, files in os.walk(directory):\n",
    "        for file in files:\n",
    "            if file.endswith(\".docx\") or file.endswith(\".doc\"):\n",
    "                filepath = os.path.join(root, file)\n",
    "                document = Document(filepath)\n",
    "                for paragraph in document.paragraphs:\n",
    "                    if keyword in paragraph.text:\n",
    "                        files_found.append(filepath)\n",
    "    return files_found\n",
    "\n",
    "# Example usage\n",
    "directory = \"C:/Local/\"\n",
    "keyword = \"kdeplot\"\n",
    "search_word_files(directory,keyword)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c2300f2a",
   "metadata": {},
   "source": [
    "#### Search for Text in Jupyter Notebooks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "786dafe4",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['C:/Local\\\\PythonDataBasics_Univariate_Visualization.ipynb',\n",
       " 'C:/Local\\\\1-Day_SpatialDataAnalytics\\\\Workflows\\\\bootstrap.ipynb',\n",
       " 'C:/Local\\\\1-Day_SpatialDataAnalytics\\\\Workflows\\\\bootstrap_demo.ipynb',\n",
       " 'C:/Local\\\\1-Day_SpatialDataAnalytics\\\\Workflows\\\\model_checking.ipynb']"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def search_ipynb_files(directory,keyword):\n",
    "    import glob\n",
    "    # set filepath to search\n",
    "    path = directory + '**/*.ipynb'\n",
    "    files_found = []\n",
    "    # looping through all the filenames returned\n",
    "    # set recursive = True to look in sub-directories too\n",
    "    for filename in glob.iglob(path, recursive=True):\n",
    "        # adding error handling just in case!\n",
    "        try:\n",
    "            with open(filename) as f:\n",
    "                # read the file as a string\n",
    "                contents = f.read()\n",
    "                # if the search term is found append to the list of files\n",
    "                if(keyword in contents):\n",
    "                    files_found.append(filename)\n",
    "        except:\n",
    "            pass\n",
    "    return files_found\n",
    "   \n",
    "directory = \"C:/Local/\"\n",
    "keyword = \"kdeplot\"\n",
    "search_ipynb_files(directory,keyword)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4767ac91",
   "metadata": {},
   "source": [
    "#### Search for Text in PowerPoint Slide Decks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "df8d8998",
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['C:/Local/1-Day_SpatialDataAnalytics\\\\DataPrep.pptx',\n",
       " 'C:/Local/1-Day_SpatialDataAnalytics\\\\DataPrep.pptx',\n",
       " 'C:/Local/1-Day_SpatialDataAnalytics\\\\DataPrep.pptx',\n",
       " 'C:/Local/1Day_ExecutiveCourse\\\\02_Feature_Engineering.pptx',\n",
       " 'C:/Local/1Day_ExecutiveCourse\\\\02_Feature_Engineering.pptx',\n",
       " 'C:/Local/1Day_ExecutiveCourse\\\\OriginalPPTX\\\\02_Inference.pptx',\n",
       " 'C:/Local/1Day_ExecutiveCourse\\\\OriginalPPTX\\\\02_Inference.pptx']"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import os\n",
    "from pptx import Presentation\n",
    "\n",
    "def search_ppt_files(directory, keyword):\n",
    "    files_found = []\n",
    "    for root, dirs, files in os.walk(directory):\n",
    "        for file in files:\n",
    "            if file.endswith(\".pptx\") or file.endswith(\".ppt\"):\n",
    "                found = False\n",
    "                filepath = os.path.join(root, file)\n",
    "                presentation = Presentation(filepath)\n",
    "                for slide in presentation.slides:\n",
    "                    for shape in slide.shapes:\n",
    "                        if shape.has_text_frame:\n",
    "                            for paragraph in shape.text_frame.paragraphs:\n",
    "                                for run in paragraph.runs:\n",
    "                                    if keyword in run.text:\n",
    "                                        files_found.append(filepath)\n",
    "    return files_found\n",
    "                                        \n",
    "# Example usage\n",
    "directory = \"C:/Local/\"\n",
    "keyword = \"decluster\"\n",
    "search_ppt_files(directory, keyword)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e8c23a39",
   "metadata": {},
   "source": [
    "#### Search for Text in Excel Spreadsheets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "id": "56852512",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['C:/Local/test.xlsx']"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def search_excel_files(directory, keyword):\n",
    "    files_found = []\n",
    "    import os\n",
    "    from openpyxl import load_workbook\n",
    "    for root, dirs, files in os.walk(directory):\n",
    "        for file in files:\n",
    "            if file.endswith(\".xlsx\") or file.endswith(\".xls\"):\n",
    "                filepath = os.path.join(root, file)\n",
    "                workbook = load_workbook(filepath)\n",
    "                for sheet_name in workbook.sheetnames:\n",
    "                    sheet = workbook[sheet_name]\n",
    "                    for row in sheet.iter_rows(values_only=True):\n",
    "                        for cell in row:\n",
    "                            if isinstance(cell, str) and keyword in cell:\n",
    "                                files_found.append(filepath)\n",
    "    return files_found\n",
    "\n",
    "# Example usage\n",
    "directory = \"C:/Local/\"\n",
    "keyword = \"kdeplot\"\n",
    "search_excel_files(directory,keyword)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f50193c9",
   "metadata": {},
   "source": [
    "#### Search for Text in PDF files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "id": "997e4b99",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['C:/Local/test.pdf']"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def search_pdf_files(directory, keyword):\n",
    "    files_found = []\n",
    "    import os\n",
    "    import PyPDF2\n",
    "    for root, dirs, files in os.walk(directory):\n",
    "        for file in files:\n",
    "            if file.endswith(\".pdf\"):\n",
    "                filepath = os.path.join(root, file)\n",
    "                with open(filepath, \"rb\") as pdf_file:\n",
    "                    reader = PyPDF2.PdfReader(pdf_file)\n",
    "                    for page in reader.pages:\n",
    "                        text = page.extract_text()\n",
    "                        if keyword in text:\n",
    "                            files_found.append(filepath)\n",
    "    return files_found\n",
    "\n",
    "# Example usage\n",
    "directory = \"C:/Local/\"\n",
    "keyword = \"kdeplot\"\n",
    "search_pdf_files(directory,keyword)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e3e9879",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
